{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import scipy\n",
    "import nltk\n",
    "import sklearn\n",
    "import random\n",
    "import re\n",
    "from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer\n",
    "from sklearn.preprocessing import OneHotEncoder\n",
    "from sklearn.multiclass import OneVsRestClassifier\n",
    "from sklearn.metrics import f1_score, precision_score, recall_score\n",
    "from sklearn.linear_model import LogisticRegression, LinearRegression\n",
    "from sklearn.naive_bayes import GaussianNB\n",
    "from sklearn.decomposition import PCA, RandomizedPCA\n",
    "from sklearn.ensemble import RandomForestClassifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package reuters to /home/felipe/nltk_data...\n",
      "[nltk_data]   Package reuters is already up-to-date!\n",
      "[nltk_data] Downloading package punkt to /home/felipe/nltk_data...\n",
      "[nltk_data]   Package punkt is already up-to-date!\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "nltk.download('reuters')\n",
    "nltk.download('punkt') # needed for tokenization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "dataset = nltk.corpus.reuters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "fileids = dataset.fileids()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction\n",
    "corpus_train = []\n",
    "corpus_test = []\n",
    "for fileid in dataset.fileids():\n",
    "    document = dataset.raw(fileid)\n",
    "    if re.match('training/',fileid):\n",
    "        corpus_train.append(document)\n",
    "    else:\n",
    "        corpus_test.append(document)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def preprocessor(string):\n",
    "    repl = re.sub('&lt;','',string)\n",
    "    return repl.lower()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "vectorizer = CountVectorizer(\n",
    "                min_df=10, # tweaking this parameter reduces the length of the feature vector\n",
    "                strip_accents='ascii',\n",
    "                preprocessor=preprocessor,\n",
    "                stop_words='english')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# need to use both corpuses for fitting because otherwise there may be words that only occur in the\n",
    "# training set or in the test set\n",
    "full_corpus = corpus_train + corpus_test\n",
    "vectorizer.fit(full_corpus)\n",
    "\n",
    "X_train_counts = vectorizer.transform(corpus_train)\n",
    "X_test_counts = vectorizer.transform(corpus_test)\n",
    "X_full_counts = vectorizer.transform(full_corpus)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "transformer = TfidfTransformer()\n",
    "# again, we need to fit the transformer to all documents (train and test)\n",
    "transformer.fit(X_full_counts)\n",
    "\n",
    "X_train_tfidf = transformer.transform(X_train_counts)\n",
    "X_test_tfidf = transformer.transform(X_test_counts)\n",
    "X_full_tfidf = transformer.transform(X_full_counts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 8.13 s, sys: 77.4 ms, total: 8.21 s\n",
      "Wall time: 8.1 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "Y_train = []\n",
    "Y_test = []\n",
    "\n",
    "for (idx,fileid) in enumerate(dataset.fileids()):    \n",
    "    categories = '*'.join(dataset.categories(fileid))\n",
    "\n",
    "    if re.match('training/',fileid):\n",
    "        Y_train.append(categories)\n",
    "    else:\n",
    "        Y_test.append(categories)\n",
    "\n",
    "series_train = pd.Series(Y_train)\n",
    "Y_train_df = series_train.str.get_dummies(sep='*')\n",
    "\n",
    "series_test = pd.Series(Y_test)\n",
    "Y_test_df = series_test.str.get_dummies(sep='*')\n",
    "\n",
    "Y_train = Y_train_df.values\n",
    "Y_test = Y_test_df.values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 1min 10s, sys: 542 ms, total: 1min 10s\n",
      "Wall time: 1min 56s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "clf = RandomForestClassifier(n_estimators=25)\n",
    "meta_clf = OneVsRestClassifier(clf,n_jobs=-1)\n",
    "\n",
    "meta_clf.fit(X_train_tfidf.toarray(),Y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "Y_pred = meta_clf.predict(X_test_tfidf.toarray())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.76837593268772808"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "f1_score(Y_test,Y_pred,average='micro')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
